128 ◾ Bioinformatics
#2- download and extract the reference genome
mkdir ref
cd ref
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/
GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.
fna.gz
#Extract the compressed the reference FASTA file
f=$(ls *.*)
gzip -d ${f}
#3- Index the reference FASTA file using samtools and bwa
f=$(ls *.*)
samtools faidx ${f}
bwa index ${f}
cd ..
#4- Align the fastq reads (multiple samples) to the reference
genome
mkdir sam
cd fastq
for i in $(ls *.fastq | rev | cut -c 9- | rev | uniq);
do
bwa mem -M -t 4 \
-R “@RG\tID:${i}\tSM:${i}” \
../ref/GCF_009858895.2_ASM985889v3_genomic.fna \
${i}_1.fastq ${i}_2.fastq > \
../sam/${i}.sam 2> ../sam/${i}.log;
done
cd ..
#5- convert SAM files into BAM files
mkdir bam
cd sam
for i in $(ls *.sam | rev | cut -c 5- | rev);
do
samtools view -uS -o ../bam/${i}.bam ${i}.sam
done
cd ..
#6- Sorting and indexing BAM files
#Sorting BAM files
mkdir sortedbam
cd bam
for i in $(ls *.bam);
do
samtools sort -T ../sortedbam/tmp.sort -o ../sortedbam/${i}
${i}
done
cd ..
#Indexing the sorted BAM files
cd sortedbam